import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import importlib
import Notebooks.utils.utils as utils
import Notebooks.utils.da_utils as da_utils
import src.ds_utils.dataset as dataset
import src.models.boost_ensemble.ensemble_wrapper as ensemble_wrapper
import src.models.shared.shap_analysis as shap
# Plot styling: seaborn theme layered over matplotlib's fivethirtyeight style.
sns.set_theme(style="darkgrid", palette="pastel")
plt.style.use("fivethirtyeight")

# Toggle for the exploratory display/print cells below.
VERBOSE = True

# Normalize the working directory so the relative cache path below resolves.
utils.fix_cwd()

# Reload to pick up edits to the dataset module without restarting the kernel.
importlib.reload(dataset)
ds = dataset.create_dataset(cache_path="dataset_cache/anonymize_cached_44898_default_80.csv")

if VERBOSE:
    # Random sample of the assembled feature frame for a visual sanity check.
    display(ds.full_data.sample(200))
| title | text | subject | date | fake | extracted_features | anonymized_text | num_entities | num_person_entities | num_org_entities | ... | gpe_Georgia | gpe_Egypt | gpe_Benghazi | gpe_Italy | gpe_Sweden | gpe_Philippines | gpe_Oregon | gpe_Baghdad | gpe_Orlando | gpe_Illinois | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15632 | UPDATE: CATHOLIC UNIVERSITY THAT STATUE REMOVE... | This is a perfect example of the effort by lib... | politics | May 27, 2015 | 1 | {'entities': [('American', 'NORP'), ('Indians'... | This is a perfect example of the effort by lib... | 28 | 5 | 8 | ... | False | False | False | False | False | False | False | False | False | False |
| 29490 | Chelsea Manning criticizes Obama, draws Trump'... | WASHINGTON (Reuters) - Former U.S. military in... | politicsNews | January 26, 2017 | 0 | {'entities': [('{RANDOM_ENTITY_GPE_0}', 'GPE')... | {RANDOM_ENTITY_GPE_0} ({RANDOM_ENTITY_ORG_1}) ... | 51 | 23 | 6 | ... | False | False | False | False | False | False | False | False | False | False |
| 22336 | WAR ON WORDS: Facebook Censorship Widens, Webs... | 21st Century Wire says The US media s neoMcCar... | US_News | December 3, 2016 | 1 | {'entities': [('{RANDOM_ENTITY_GPE_0}', 'GPE')... | 21st Century Wire says The {RANDOM_ENTITY_GPE_... | 71 | 3 | 37 | ... | False | False | False | False | False | False | False | False | False | False |
| 19487 | DEMOCRATS AREN’T AFRAID Trump Will Be A Terrib... | The brilliant, outspoken and beautiful Tomi La... | left-news | Dec 7, 2016 | 1 | {'entities': [('{RANDOM_ENTITY_PERSON_0}', 'PE... | The brilliant, outspoken and beautiful {RANDOM... | 1 | 1 | 0 | ... | False | False | False | False | False | False | False | False | False | False |
| 43007 | Palestinian President Abbas says peace closer ... | NEW YORK (Reuters) - Palestinian President Mah... | worldnews | September 20, 2017 | 0 | {'entities': [('{RANDOM_ENTITY_GPE_0}', 'GPE')... | {RANDOM_ENTITY_GPE_0} ({RANDOM_ENTITY_ORG_1}) ... | 87 | 21 | 7 | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15714 | HAPPY MOTHER’S DAY…Your First Grader Just Rate... | Rate your mom for Mother s Day sounds like the... | politics | May 11, 2015 | 1 | {'entities': [('first', 'ORDINAL'), ('Ridgecre... | Rate your mom for {RANDOM_ENTITY_DATE_2} sound... | 16 | 1 | 1 | ... | False | False | False | False | False | False | False | False | False | False |
| 3926 | Trump Goes COMPLETELY Off The Rails Over Hill... | The right wing is still harping on the FBI s d... | News | November 5, 2016 | 1 | {'entities': [('{RANDOM_ENTITY_ORG_0}', 'ORG')... | The right wing is still harping on the {RANDOM... | 23 | 13 | 4 | ... | False | False | False | False | False | False | False | False | False | False |
| 35350 | Chile election ends era of female presidents i... | SANTIAGO (Reuters) - When Chile s President Mi... | worldnews | December 19, 2017 | 0 | {'entities': [('{RANDOM_ENTITY_ORG_0}', 'ORG')... | {RANDOM_ENTITY_ORG_0} ({RANDOM_ENTITY_ORG_1}) ... | 101 | 29 | 11 | ... | False | False | False | False | False | False | False | False | False | False |
| 3236 | Charlie Sheen Sends SHOCKING Message About Tr... | This year was a rough one and many of us can t... | News | December 29, 2016 | 1 | {'entities': [('{RANDOM_ENTITY_DATE_0}', 'DATE... | {RANDOM_ENTITY_DATE_0} was a rough one and man... | 40 | 26 | 2 | ... | False | False | False | False | False | False | False | False | False | False |
| 38093 | Venezuela's ex-prosecutor wants Maduro tried a... | CARACAS (Reuters) - Venezuela s sacked former ... | worldnews | November 16, 2017 | 0 | {'entities': [('{RANDOM_ENTITY_ORG_0}', 'ORG')... | CARACAS ({RANDOM_ENTITY_ORG_0}) - {RANDOM_ENTI... | 37 | 14 | 5 | ... | False | False | False | False | False | False | False | False | False | False |
200 rows × 289 columns
if VERBOSE:
    # Column count sanity check, then a full (unpaginated) dtype listing.
    print(len(ds.full_data.dtypes))
    # Temporarily lift the row limit so all ~289 columns are shown.
    with pd.option_context('display.max_rows', None):
        dtype_df = pd.DataFrame({'Column': ds.full_data.dtypes.index, 'Type': ds.full_data.dtypes.values})
        display(dtype_df)
    # display(ds.full_data.dtypes)
289
| Column | Type | |
|---|---|---|
| 0 | title | object |
| 1 | text | object |
| 2 | subject | object |
| 3 | date | object |
| 4 | fake | int64 |
| 5 | extracted_features | object |
| 6 | anonymized_text | object |
| 7 | num_entities | int64 |
| 8 | num_person_entities | int64 |
| 9 | num_org_entities | int64 |
| 10 | num_gpe_entities | int64 |
| 11 | num_date_entities | int64 |
| 12 | sentiment | float64 |
| 13 | num_words | int64 |
| 14 | num_sents | int64 |
| 15 | avg_word_length | float64 |
| 16 | lexical_diversity | float64 |
| 17 | readability | float64 |
| 18 | num_tokens | int64 |
| 19 | char_count | int64 |
| 20 | ner_density | float64 |
| 21 | max_dep_depth | int64 |
| 22 | avg_token_vector_norm | float64 |
| 23 | avg_sent_length | float64 |
| 24 | median_sent_length | float64 |
| 25 | std_sent_length | float64 |
| 26 | punctuation_density | float64 |
| 27 | stop_words_ratio | float64 |
| 28 | pos_ratio_PROPN | float64 |
| 29 | pos_ratio_AUX | float64 |
| 30 | pos_ratio_ADP | float64 |
| 31 | pos_ratio_INTJ | float64 |
| 32 | pos_ratio_PART | float64 |
| 33 | pos_ratio_VERB | float64 |
| 34 | pos_ratio_ADV | float64 |
| 35 | pos_ratio_SYM | float64 |
| 36 | pos_ratio_NOUN | float64 |
| 37 | pos_ratio_ADJ | float64 |
| 38 | pos_ratio_NUM | float64 |
| 39 | pos_ratio_PRON | float64 |
| 40 | pos_ratio_SPACE | float64 |
| 41 | pos_ratio_X | float64 |
| 42 | pos_ratio_SCONJ | float64 |
| 43 | pos_ratio_DET | float64 |
| 44 | pos_ratio_PUNCT | float64 |
| 45 | pos_ratio_CCONJ | float64 |
| 46 | verb_tense_ratio_past | float64 |
| 47 | verb_tense_ratio_present | int64 |
| 48 | verb_tense_ratio_future | int64 |
| 49 | person_Trump | bool |
| 50 | person_Obama | bool |
| 51 | person_Clinton | bool |
| 52 | person_Hillary | bool |
| 53 | person_Sanders | bool |
| 54 | person_Cruz | bool |
| 55 | person_Putin | bool |
| 56 | person_Ryan | bool |
| 57 | person_Bush | bool |
| 58 | person_Flynn | bool |
| 59 | person_@realDonaldTrump | bool |
| 60 | person_McCain | bool |
| 61 | person_Obamacare | bool |
| 62 | person_Rubio | bool |
| 63 | person_Merkel | bool |
| 64 | person_Donald Trump s | bool |
| 65 | person_James Comey | bool |
| 66 | person_Tillerson | bool |
| 67 | person_Mike Pence | bool |
| 68 | person_Johnson | bool |
| 69 | person_McConnell | bool |
| 70 | person_Moore | bool |
| 71 | person_Kelly | bool |
| 72 | person_Assad | bool |
| 73 | person_Carson | bool |
| 74 | person_Hillary Clinton s | bool |
| 75 | person_Bannon | bool |
| 76 | person_Warren | bool |
| 77 | person_Donald | bool |
| 78 | person_Hariri | bool |
| 79 | person_Sean Spicer | bool |
| 80 | person_Mitt Romney | bool |
| 81 | person_Erdogan | bool |
| 82 | person_Jones | bool |
| 83 | person_Conway | bool |
| 84 | person_Biden | bool |
| 85 | person_Mattis | bool |
| 86 | person_Macron | bool |
| 87 | person_Graham | bool |
| 88 | person_Brown | bool |
| 89 | person_Xi | bool |
| 90 | person_Schumer | bool |
| 91 | person_Kushner | bool |
| 92 | person_Mnuchin | bool |
| 93 | person_Scalia | bool |
| 94 | person_Robert Mueller | bool |
| 95 | person_Soros | bool |
| 96 | person_Manafort | bool |
| 97 | person_Reagan | bool |
| 98 | person_Palin | bool |
| 99 | person_Mugabe | bool |
| 100 | person_Ivanka | bool |
| 101 | person_Kasich | bool |
| 102 | person_Gorsuch | bool |
| 103 | person_Don | bool |
| 104 | person_Jeff Sessions | bool |
| 105 | person_Kennedy | bool |
| 106 | person_Netanyahu | bool |
| 107 | person_Miller | bool |
| 108 | person_Abe | bool |
| 109 | person_Khan | bool |
| 110 | person_Reilly | bool |
| 111 | person_Stein | bool |
| 112 | person_Williams | bool |
| 113 | person_Reid | bool |
| 114 | person_Haley | bool |
| 115 | person_Smith | bool |
| 116 | person_Rice | bool |
| 117 | person_Nancy Pelosi | bool |
| 118 | person_Paul | bool |
| 119 | person_Ross | bool |
| 120 | person_Lee | bool |
| 121 | person_Weinstein | bool |
| 122 | person_Davis | bool |
| 123 | person_Bill | bool |
| 124 | person_Breitbart | bool |
| 125 | person_Carter | bool |
| 126 | person_King | bool |
| 127 | person_John Kerry | bool |
| 128 | person_Garland | bool |
| 129 | org_Reuters | bool |
| 130 | org_the White House | bool |
| 131 | org_Senate | bool |
| 132 | org_Congress | bool |
| 133 | org_FBI | bool |
| 134 | org_State | bool |
| 135 | org_EU | bool |
| 136 | org_CNN | bool |
| 137 | org_the State Department | bool |
| 138 | org_the Supreme Court | bool |
| 139 | org_GOP | bool |
| 140 | org_Fox News | bool |
| 141 | org_Facebook | bool |
| 142 | org_U.N. | bool |
| 143 | org_the New York Times | bool |
| 144 | org_Twitter | bool |
| 145 | org_CIA | bool |
| 146 | org_the Washington Post | bool |
| 147 | org_the United Nations | bool |
| 148 | org_ISIS | bool |
| 149 | org_the House of Representatives | bool |
| 150 | org_NATO | bool |
| 151 | org_the European Union | bool |
| 152 | org_the Republican Party | bool |
| 153 | org_the Democratic Party | bool |
| 154 | org_DNC | bool |
| 155 | org_Commission | bool |
| 156 | org_Comey | bool |
| 157 | org_NBC | bool |
| 158 | org_Pentagon | bool |
| 159 | org_MSNBC | bool |
| 160 | org_Army | bool |
| 161 | org_Fox | bool |
| 162 | org_Kremlin | bool |
| 163 | org_EPA | bool |
| 164 | org_Treasury | bool |
| 165 | org_the Department of Justice | bool |
| 166 | org_Breitbart News | bool |
| 167 | org_the U.N. Security Council | bool |
| 168 | org_Planned Parenthood | bool |
| 169 | org_the Clinton Foundation | bool |
| 170 | org_Medicaid | bool |
| 171 | org_Brexit | bool |
| 172 | org_al Qaeda | bool |
| 173 | org_Hezbollah | bool |
| 174 | org_ABC News | bool |
| 175 | org_Islam | bool |
| 176 | org_Politico | bool |
| 177 | org_Defense | bool |
| 178 | org_Capitol Hill | bool |
| 179 | org_Google | bool |
| 180 | org_Fed | bool |
| 181 | org_NFL | bool |
| 182 | org_Navy | bool |
| 183 | org_Group | bool |
| 184 | org_NSA | bool |
| 185 | org_Taliban | bool |
| 186 | org_ABC | bool |
| 187 | org_CBS | bool |
| 188 | org_WikiLeaks | bool |
| 189 | org_SPD | bool |
| 190 | org_the Democratic National Committee | bool |
| 191 | org_Mueller | bool |
| 192 | org_Justice Department | bool |
| 193 | org_the Wall Street Journal | bool |
| 194 | org_DOJ | bool |
| 195 | org_Daily Mail | bool |
| 196 | org_the Senate Intelligence Committee | bool |
| 197 | org_Getty Images | bool |
| 198 | org_NRA | bool |
| 199 | org_MOSCOW | bool |
| 200 | org_DHS | bool |
| 201 | org_the Department of Homeland Security | bool |
| 202 | org_LePage | bool |
| 203 | org_Confederate | bool |
| 204 | org_the Senate Judiciary Committee | bool |
| 205 | org_RNC | bool |
| 206 | org_Cabinet | bool |
| 207 | org_IRS | bool |
| 208 | org_Secret Service | bool |
| 209 | gpe_U.S. | bool |
| 210 | gpe_the United States | bool |
| 211 | gpe_Washington | bool |
| 212 | gpe_Russia | bool |
| 213 | gpe_America | bool |
| 214 | gpe_China | bool |
| 215 | gpe_Trump | bool |
| 216 | gpe_North Korea | bool |
| 217 | gpe_Iran | bool |
| 218 | gpe_Syria | bool |
| 219 | gpe_New York | bool |
| 220 | gpe_Iraq | bool |
| 221 | gpe_Mexico | bool |
| 222 | gpe_Britain | bool |
| 223 | gpe_Israel | bool |
| 224 | gpe_Texas | bool |
| 225 | gpe_Turkey | bool |
| 226 | gpe_Florida | bool |
| 227 | gpe_Germany | bool |
| 228 | gpe_Moscow | bool |
| 229 | gpe_Myanmar | bool |
| 230 | gpe_California | bool |
| 231 | gpe_Saudi Arabia | bool |
| 232 | gpe_London | bool |
| 233 | gpe_Beijing | bool |
| 234 | gpe_North Carolina | bool |
| 235 | gpe_Japan | bool |
| 236 | gpe_France | bool |
| 237 | gpe_Afghanistan | bool |
| 238 | gpe_Paris | bool |
| 239 | gpe_Virginia | bool |
| 240 | gpe_India | bool |
| 241 | gpe_Jerusalem | bool |
| 242 | gpe_Ohio | bool |
| 243 | gpe_Cuba | bool |
| 244 | gpe_Ukraine | bool |
| 245 | gpe_Yemen | bool |
| 246 | gpe_Libya | bool |
| 247 | gpe_Chicago | bool |
| 248 | gpe_Brussels | bool |
| 249 | gpe_Pyongyang | bool |
| 250 | gpe_Canada | bool |
| 251 | gpe_Australia | bool |
| 252 | gpe_Michigan | bool |
| 253 | gpe_Pakistan | bool |
| 254 | gpe_Puerto Rico | bool |
| 255 | gpe_Iowa | bool |
| 256 | gpe_Lebanon | bool |
| 257 | gpe_Bangladesh | bool |
| 258 | gpe_Arizona | bool |
| 259 | gpe_Taiwan | bool |
| 260 | gpe_Hollywood | bool |
| 261 | gpe_Spain | bool |
| 262 | gpe_UK | bool |
| 263 | gpe_Alabama | bool |
| 264 | gpe_Tehran | bool |
| 265 | gpe_Berlin | bool |
| 266 | gpe_New Jersey | bool |
| 267 | gpe_Wisconsin | bool |
| 268 | gpe_New York City | bool |
| 269 | gpe_D.C. | bool |
| 270 | gpe_Venezuela | bool |
| 271 | gpe_Kansas | bool |
| 272 | gpe_Pennsylvania | bool |
| 273 | gpe_Vietnam | bool |
| 274 | gpe_Poland | bool |
| 275 | gpe_Flint | bool |
| 276 | gpe_New Hampshire | bool |
| 277 | gpe_Catalonia | bool |
| 278 | gpe_Ireland | bool |
| 279 | gpe_Georgia | bool |
| 280 | gpe_Egypt | bool |
| 281 | gpe_Benghazi | bool |
| 282 | gpe_Italy | bool |
| 283 | gpe_Sweden | bool |
| 284 | gpe_Philippines | bool |
| 285 | gpe_Oregon | bool |
| 286 | gpe_Baghdad | bool |
| 287 | gpe_Orlando | bool |
| 288 | gpe_Illinois | bool |
# print(len(ds.full_data))
# Reload the analysis helpers, then plot text-size distributions for the raw
# 'text' column using the precomputed spaCy token counts.
importlib.reload(da_utils)
da_utils.plot_text_size_distributions(ds.full_data, text_col="text", token_counts=ds.full_data["num_tokens"])
| Unnamed: 0 | title | text | subject | date | fake | extracted_features | anonymized_text | num_entities | num_person_entities | ... | gpe_Pyongyang | gpe_Canada | gpe_Australia | gpe_Michigan | gpe_Pakistan | gpe_Puerto Rico | gpe_Iowa | gpe_Lebanon | gpe_Bangladesh | gpe_Arizona | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11 | 12038 | WHERE’S HILLARY? CLINTON SPOTTED Dining Alone | politics | Dec 30, 2016 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False | ||
| 41 | 12750 | WHOA! RUSH LIMBAUGH RIPS Into Republicans Who ... | politics | Oct 12, 2016 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False | ||
| 177 | 13008 | HILLARY TAKES CREDIT For The Arab Spring Disas... | politics | Sep 15, 2016 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False | ||
| 324 | 11478 | CHILLING! FOX REPORTER JAMES ROSEN Recounts Be... | politics | Mar 6, 2017 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False | ||
| 400 | 11854 | LIVE FEED: INAUGURATION 2017! | politics | Jan 20, 2017 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False | ||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44632 | 12183 | TUCKER CARLSON DESTROYS Smug Elector Who Refus... | https://youtu.be/7oOhwHG2Gb4 | politics | Dec 9, 2016 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | https://youtu.be/7oOhwHG2Gb4 | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False |
| 44782 | 19115 | TUCKER CARLSON Defends Trump On Sweden Comment... | left-news | Feb 20, 2017 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False | ||
| 44868 | 19118 | AWESOME! HISPANIC TRUMP SUPPORTER Rips Into Pr... | left-news | Feb 19, 2017 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False | ||
| 44878 | 19769 | DEMOCRAT OPERATIVES Caught Planning To Bully W... | left-news | Oct 20, 2016 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False | ||
| 44893 | 11284 | UNREAL! CBS’S TED KOPPEL Tells Sean Hannity He... | politics | Mar 27, 2017 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False |
836 rows × 169 columns
# Stacked KDE of token counts, split by the binary target ('fake' vs real).
importlib.reload(da_utils)
da_utils.plot_stacked_kde_text_length_vs_target(df=ds.full_data, len_column="num_tokens", target_column="fake")
<Axes: title={'center': 'Text Size (Spacy Token Count)'}, xlabel='Text Length', ylabel='Density'>
1.4 Entity Analysis¶
# 3. Proportion of "fake" for binary columns (prefixed with person_, org_, gpe_, etc.) including count
binary_columns = ds.full_data.columns[ds.full_data.columns.str.startswith(('person_', 'org_', 'gpe_'))]

# Overall presence rate and raw count per binary entity feature.
fake_proportion_binary = ds.full_data[binary_columns].mean().reset_index()
fake_proportion_binary.columns = ['binary_column', 'total_proportion']
fake_proportion_binary['total_count'] = ds.full_data[binary_columns].sum().values

# Per-entity breakdown: articles mentioning the entity, split by label.
# The label mask is loop-invariant, so build it once; boolean sums replace
# the original pattern of filtering the full frame three times per column.
is_fake = ds.full_data["fake"] == True
d = []
for col in binary_columns:
    present = ds.full_data[col] == True
    total = int(present.sum())
    fake_count = int((present & is_fake).sum())
    d.append({
        'col': col,
        'total': total,
        'fake': fake_count,
        # 'fake' is a 0/1 int column (see dtype listing), so real = total - fake.
        'true': total - fake_count,
    })
d = pd.DataFrame(d)
# NaN when an entity never appears (total == 0), matching plain division.
d["prop_fake"] = d["fake"] / d["total"]
d.sort_values(by="total", ascending=False)
| col | total | fake | true | prop_fake | |
|---|---|---|---|---|---|
| 0 | person_Trump | 21541 | 11981 | 9560 | 0.556195 |
| 80 | org_Reuters | 21531 | 302 | 21229 | 0.014026 |
| 15 | person_Donald Trump s | 18051 | 8873 | 9178 | 0.491552 |
| 10 | person_@realDonaldTrump | 16995 | 7961 | 9034 | 0.468432 |
| 160 | gpe_U.S. | 16027 | 4671 | 11356 | 0.291446 |
| ... | ... | ... | ... | ... | ... |
| 61 | person_Reilly | 217 | 206 | 11 | 0.949309 |
| 53 | person_Gorsuch | 217 | 79 | 138 | 0.364055 |
| 226 | gpe_Flint | 201 | 145 | 56 | 0.721393 |
| 50 | person_Mugabe | 192 | 4 | 188 | 0.020833 |
| 140 | org_SPD | 179 | 6 | 173 | 0.033520 |
240 rows × 5 columns
total_samples = len(ds.full_data)
# Corpus-wide base rate of the positive ('fake') class.
overall_fake_proportion = ds.full_data['fake'].mean()

# Frequency normalized to [0, 1]; fake-rate centered on the corpus base rate,
# so positive values mean "more fake than average".
d['total_normalized'] = d['total'] / total_samples
d['prop_fake_normalized'] = d['prop_fake'] - overall_fake_proportion

# Create importance score (you can adjust the weights if needed):
# equal-weight blend of how common an entity is and how far its fake-rate
# deviates (in either direction) from the overall rate.
d['importance_score'] = 0.5 * d['total_normalized'] + 0.5 * abs(d['prop_fake_normalized'])

# Sort by importance score
d_sorted = d.sort_values('importance_score', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='importance_score', y='col', data=d_sorted.head(20))
plt.title('Top 20 Entities by Importance Score')
plt.xlabel('Importance Score')
plt.ylabel('Entity')
plt.tight_layout()
plt.show()
The importance score combines two factors: how often an entity appears (frequency) and how strongly it's associated with fake news (predictive power). It's calculated by normalizing and averaging these two measures. A higher score indicates that an entity is both common and a good predictor of fake news. This helps identify the most relevant entities for distinguishing between fake and real news articles.
# Frequency vs fake-rate scatter; color and size both encode the blended
# importance score, so high-importance entities pop out.
plt.figure(figsize=(12, 8))
sns.scatterplot(x='total', y='prop_fake', data=d, hue='importance_score', size='importance_score', sizes=(20, 200))
plt.title('Total Count vs. Proportion of Fake News')
plt.xlabel('Total Count')
plt.ylabel('Proportion of Fake News')

# Label only the 10 most important entities to keep the plot readable.
for _, row in d_sorted.head(10).iterrows():
    plt.annotate(row['col'], (row['total'], row['prop_fake']))

plt.tight_layout()
plt.show()
# Heatmap of the two importance-score ingredients for the 30 highest-scoring
# entities: normalized frequency and base-rate-centered fake proportion.
top_30 = d_sorted.head(30).set_index('col')
plt.figure(figsize=(12, 10))
sns.heatmap(top_30[['total_normalized', 'prop_fake_normalized']], annot=True, cmap='YlOrRd')
plt.title('Top 30 Entities: Normalized Total Count and Fake News Proportion')
plt.tight_layout()
plt.show()
# NOTE(review): the summary below is disabled because no 'entity_type' column
# is ever derived from 'col' above; enable once that column exists.
# entity_type_summary = d.groupby('entity_type').agg({
# 'total': 'sum',
# 'prop_fake': 'mean',
# 'importance_score': 'mean'
# }).sort_values('importance_score', ascending=False)
#
# entity_type_summary

# Pairwise correlations between frequency, fake-rate, and the blended score.
correlation_matrix = d[['total', 'prop_fake', 'importance_score']].corr()
correlation_matrix
| total | prop_fake | importance_score | |
|---|---|---|---|
| total | 1.000000 | -0.063512 | 0.388932 |
| prop_fake | -0.063512 | 1.000000 | -0.277533 |
| importance_score | 0.388932 | -0.277533 | 1.000000 |
1.4.1 PCA¶
# Numeric-only view of the feature frame (drops object and bool columns,
# keeping int/float features plus the 'fake' target).
numeric_feats = ds.full_data.select_dtypes(include=[np.number])
# float_columns = df.select_dtypes(include=['float'])
numeric_feats
| fake | num_entities | num_person_entities | num_org_entities | num_gpe_entities | num_date_entities | sentiment | num_words | num_sents | avg_word_length | ... | pos_ratio_PRON | pos_ratio_SPACE | pos_ratio_X | pos_ratio_SCONJ | pos_ratio_DET | pos_ratio_PUNCT | pos_ratio_CCONJ | verb_tense_ratio_past | verb_tense_ratio_present | verb_tense_ratio_future | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 22216 | 1 | 13 | 6 | 4 | 3 | 0 | 0.08333 | 187 | 7 | 4.58824 | ... | 0.01070 | 0.02674 | 0.00000 | 0.01070 | 0.12299 | 0.05348 | 0.04278 | 0.45455 | 0 | 0 |
| 27917 | 0 | 114 | 49 | 28 | 18 | 10 | 0.08660 | 904 | 29 | 4.47898 | ... | 0.04978 | 0.00221 | 0.00000 | 0.00664 | 0.08407 | 0.11394 | 0.02102 | 0.60759 | 0 | 0 |
| 25007 | 0 | 26 | 5 | 4 | 6 | 4 | -0.00504 | 359 | 14 | 4.30084 | ... | 0.06128 | 0.01114 | 0.00000 | 0.01114 | 0.08635 | 0.12813 | 0.01393 | 0.64706 | 0 | 0 |
| 1377 | 1 | 24 | 8 | 1 | 8 | 3 | -0.01116 | 230 | 12 | 4.61739 | ... | 0.08696 | 0.06957 | 0.00000 | 0.01739 | 0.07391 | 0.10870 | 0.01304 | 0.73077 | 0 | 0 |
| 32476 | 0 | 75 | 10 | 15 | 19 | 9 | 0.03935 | 606 | 23 | 4.30363 | ... | 0.06601 | 0.00165 | 0.00000 | 0.01320 | 0.08581 | 0.10561 | 0.01980 | 0.53846 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11284 | 1 | 0 | 0 | 0 | 0 | 0 | 0.00000 | 1 | 1 | 1.00000 | ... | 0.00000 | 1.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0 | 0 |
| 44732 | 0 | 121 | 17 | 20 | 34 | 22 | 0.06335 | 807 | 24 | 4.50310 | ... | 0.03346 | 0.01735 | 0.00000 | 0.01611 | 0.06568 | 0.08674 | 0.02726 | 0.52830 | 0 | 0 |
| 38158 | 0 | 6 | 1 | 1 | 1 | 2 | -0.10606 | 89 | 5 | 4.02247 | ... | 0.11236 | 0.04494 | 0.00000 | 0.01124 | 0.06742 | 0.13483 | 0.02247 | 0.28571 | 0 | 0 |
| 860 | 1 | 111 | 26 | 21 | 1 | 25 | -0.03391 | 1092 | 48 | 3.83700 | ... | 0.06868 | 0.04579 | 0.00183 | 0.02198 | 0.05586 | 0.14011 | 0.01282 | 0.28986 | 0 | 0 |
| 15795 | 1 | 36 | 10 | 13 | 4 | 3 | 0.03011 | 290 | 10 | 4.21724 | ... | 0.03103 | 0.01034 | 0.00000 | 0.00690 | 0.10000 | 0.10345 | 0.01379 | 0.57143 | 0 | 0 |
44898 rows × 43 columns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Drop the target so it does not leak into the unsupervised projection.
# (Bug fix: the original dropped 'fake' into an unused frame and then ran
# PCA on the full numeric matrix, target included.)
features_for_pca = numeric_feats.drop(columns=['fake'], errors='ignore')

# Standardize first: PCA is scale-sensitive and the features mix raw counts
# with ratios.
X_scaled = StandardScaler().fit_transform(features_for_pca)

# Perform PCA on all components to inspect the variance spectrum.
pca = PCA()
pca.fit(X_scaled)

# Calculate cumulative explained variance ratio
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# Plot cumulative explained variance
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_variance_ratio) + 1), cumulative_variance_ratio, 'bo-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Cumulative Explained Variance vs. Number of Components')
plt.grid(True)
plt.tight_layout()
plt.show()

# First component count whose cumulative variance reaches 95% (1-based).
n_components_95 = np.argmax(cumulative_variance_ratio >= 0.95) + 1
print(f"Number of components needed to explain 95% of the variance: {n_components_95}")
Number of components needed to explain 95% of the variance: 25
from sklearn.manifold import TSNE

# T-SNE on the standardized numeric features, target excluded.
input_df = numeric_feats.copy()

# Drop the 'fake' column if it exists
if 'fake' in input_df.columns:
    input_df = input_df.drop('fake', axis=1)

# Prepare the data for t-SNE
features_for_tsne = input_df.select_dtypes(include=[np.number])  # Select only numeric columns
scaled_features = StandardScaler().fit_transform(features_for_tsne)

# Perform t-SNE; random_state pins the (otherwise stochastic) embedding.
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(scaled_features)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import time

# Work on a copy so cluster labels can be attached later without mutating
# the numeric view of the source frame.
data = numeric_feats.copy()

# Remove the 'fake' column (target must not drive the clustering)
features = data.drop('fake', axis=1)

# Standardize the features (K-means is distance-based and scale-sensitive)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
# Function to perform K-means clustering
def perform_kmeans(data, max_clusters=10):
    """Fit K-means for k = 2..max_clusters and collect diagnostics.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Standardized feature matrix.
    max_clusters : int, default 10
        Largest number of clusters to try (inclusive).

    Returns
    -------
    tuple of (list, list, list)
        Inertias, silhouette scores, and wall-clock fit times,
        one entry per k in range(2, max_clusters + 1).
    """
    inertias = []
    silhouette_scores = []
    times = []
    for k in range(2, max_clusters + 1):
        start_time = time.time()
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
        kmeans.fit(data)
        end_time = time.time()
        inertias.append(kmeans.inertia_)
        # NOTE: silhouette_score is O(n^2) in samples — slow on ~45k rows.
        silhouette_scores.append(silhouette_score(data, kmeans.labels_))
        times.append(end_time - start_time)
    return inertias, silhouette_scores, times
# Perform K-means for different numbers of clusters (k = 2..10)
inertias, silhouette_scores, times = perform_kmeans(scaled_features)

# Plot the elbow curve (inertia vs k; look for the bend)
plt.figure(figsize=(12, 4))
plt.subplot(131)
plt.plot(range(2, len(inertias) + 2), inertias, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method')

# Plot the silhouette scores (higher = better-separated clusters)
plt.subplot(132)
plt.plot(range(2, len(silhouette_scores) + 2), silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')

# Plot the execution times per k
plt.subplot(133)
plt.plot(range(2, len(times) + 2), times, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Execution Time (seconds)')
plt.title('Performance Analysis')

plt.tight_layout()
plt.show()
# Determine the optimal number of clusters (you may need to adjust this based on the plots)
optimal_clusters = 10 # This is an example, adjust based on your analysis

# Perform final clustering with the optimal number of clusters
final_kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
cluster_labels = final_kmeans.fit_predict(scaled_features)

# Add cluster labels to the original dataset (the copy made above)
data['Cluster'] = cluster_labels

# # Print summary statistics for each cluster
# for cluster in range(optimal_clusters):
# print(f"\nCluster {cluster} Statistics:")
# print(data[data['Cluster'] == cluster].describe())
# Visualize the clusters using the first two principal components (optional)
from sklearn.decomposition import PCA  # re-import; harmless in a notebook cell

pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_features)

# 2-D projection colored by K-means cluster assignment.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(pca_result[:, 0], pca_result[:, 1], c=cluster_labels, cmap='viridis')
plt.colorbar(scatter)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('Cluster Visualization using PCA')
plt.show()
# Create a scatter plot of the t-SNE results, colored by K-means cluster.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(tsne_results[:, 0],
                      tsne_results[:, 1],
                      alpha=0.75,
                      s=1,
                      c=cluster_labels,
                      # c=features_for_tsne['sentiment'],
                      cmap='viridis')
# Fix: points are colored by cluster id, not the fake/real label, so the
# colorbar label must say so.
plt.colorbar(scatter, label='Cluster')
plt.title('t-SNE Visualization of Clusters')
plt.xlabel('t-SNE feature 1')
plt.ylabel('t-SNE feature 2')
plt.tight_layout()
plt.show()

# Print some statistics about the t-SNE results (embedding extent sanity check)
print("t-SNE Results Shape:", tsne_results.shape)
print("t-SNE Range - X:", tsne_results[:, 0].min(), "to", tsne_results[:, 0].max())
print("t-SNE Range - Y:", tsne_results[:, 1].min(), "to", tsne_results[:, 1].max())

# Optionally, save the 2-D t-SNE coordinates back to the dataframe for reuse.
input_df['tsne_1'] = tsne_results[:, 0]
input_df['tsne_2'] = tsne_results[:, 1]
t-SNE Results Shape: (44898, 2) t-SNE Range - X: -94.89072 to 95.122025 t-SNE Range - Y: -95.44623 to 91.66013
TODO: For clustering, use unsupervised FastText embeddings — or, better, BERT embeddings: embed each text, then run t-SNE and cluster in the embedding space.
(Reference discussion: https://chatgpt.com/c/8bbeec1a-00e9-4840-b874-8cb225ca2fbc)
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884 warnings.warn( A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. 
A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. 
A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. 
A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning. A parameter name that contains `beta` will be renamed internally to `bias`. Please use a different name to suppress this warning. A parameter name that contains `gamma` will be renamed internally to `weight`. Please use a different name to suppress this warning.
from contextlib import nullcontext
import torch
from transformers import AlbertTokenizer, AlbertModel
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from torch.cuda.amp import autocast
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# tokenizer = AlbertTokenizer.from_pretrained(model_name)
# model = AlbertModel.from_pretrained(model_name)
class TextDataset(Dataset):
    """Map-style dataset that tokenizes raw texts on the fly.

    Each item is padded/truncated to ``max_length`` and returned as a dict
    of 1-D tensors (the batch dim added by ``return_tensors="pt"`` is
    squeezed out so the DataLoader can re-collate items itself).
    """

    def __init__(self, texts, tokenizer, max_length):
        # Materialize to a plain list so indexing is always positional.
        # Previously a pandas Series was indexed directly, which uses
        # label-based lookup and raises KeyError for any Series whose
        # index is not a clean 0..n-1 RangeIndex (e.g. after filtering).
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation="longest_first",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop the leading batch dimension from each tensor.
        return {k: v.squeeze(0) for k, v in inputs.items()}
def get_embeddings(texts, model, tokenizer=None, batch_size=32, max_length=512,
                   use_gpu=True, model_type='auto', num_workers=None):
    """Compute one embedding vector per input text.

    Parameters
    ----------
    texts : sequence or pandas.Series of str
        Documents to embed.
    model : transformers model ('auto') or SentenceTransformer
        ('sentence_transformer').
    tokenizer : HF tokenizer; required for model_type='auto', unused otherwise.
    batch_size : texts per forward pass.
    max_length : padding/truncation length (model_type='auto' only).
    use_gpu : request CUDA with half precision + autocast. Falls back to
        CPU in full precision when CUDA is unavailable — previously
        use_gpu=True on a CPU-only machine still called model.half(),
        which breaks many CPU kernels.
    model_type : 'auto' uses the [CLS] token of last_hidden_state;
        'sentence_transformer' uses model.encode().
    num_workers : DataLoader workers for the 'auto' path. Defaults to the
        historical behavior (0 when on GPU, 4 on CPU).

    Returns
    -------
    numpy.ndarray of shape (len(texts), hidden_dim).

    Raises
    ------
    ValueError if ``model_type`` is not one of the two supported values.
    """
    # Only enable GPU-specific behavior when CUDA actually exists.
    on_gpu = use_gpu and torch.cuda.is_available()
    device = torch.device('cuda' if on_gpu else 'cpu')

    # Progress bars are cosmetic — degrade gracefully if tqdm is absent.
    try:
        from tqdm import tqdm as _progress
    except ImportError:
        def _progress(iterable, **_kwargs):
            return iterable

    if num_workers is None:
        num_workers = 0 if on_gpu else 4

    model = model.to(device)

    if model_type == 'auto':
        if on_gpu:
            model = model.half()
        model.eval()
        dataset = TextDataset(texts, tokenizer, max_length)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False,
                                num_workers=num_workers)
        embeddings = []
        with torch.no_grad(), (autocast() if on_gpu else nullcontext()):
            for batch in _progress(dataloader, desc="Generating embeddings"):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                # [CLS] token embedding as the document representation.
                batch_embeddings = outputs.last_hidden_state[:, 0, :]
                embeddings.append(batch_embeddings.cpu())
        return torch.cat(embeddings).numpy()
    elif model_type == 'sentence_transformer':
        if isinstance(texts, pd.Series):
            texts = texts.tolist()
        model.eval()
        if on_gpu:
            model = model.half()  # Use half-precision only when on GPU
        embeddings = []
        with torch.no_grad(), (autocast() if on_gpu else nullcontext()):
            for i in _progress(range(0, len(texts), batch_size),
                               desc="Generating embeddings"):
                batch = texts[i:i + batch_size]
                batch_embeddings = model.encode(batch, device=device,
                                                convert_to_tensor=True)
                embeddings.append(batch_embeddings.cpu())
        return torch.cat(embeddings).numpy()
    else:
        raise ValueError("Invalid model_type. Choose 'auto' or 'sentence_transformer'.")
# def get_embeddings(texts, batch_size=32):
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)
# model.eval()
#
# embeddings = []
#
# for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
# batch_texts = texts[i:i+batch_size]
#
# # Truncate and tokenize
# truncated_texts = [truncate_from_middle(text, 512) for text in batch_texts]
# inputs = tokenizer(truncated_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
#
# inputs = {k: v.to(device) for k, v in inputs.items()}
#
# with torch.no_grad():
# outputs = model(**inputs)
#
# # Use the [CLS] token embedding as the sentence embedding
# batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
# embeddings.extend(batch_embeddings)
#
# return np.array(embeddings)
def cluster_and_visualize(embeddings, n_clusters=5):
    """Cluster embeddings with K-means and show them on a 2-D t-SNE map.

    Returns the per-sample cluster labels.
    """
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(embeddings)

    # Project to 2-D purely for plotting purposes.
    projection = TSNE(n_components=2, random_state=42).fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    points = plt.scatter(projection[:, 0], projection[:, 1],
                         c=labels, alpha=0.6, cmap='viridis')
    plt.colorbar(points)
    plt.title('t-SNE visualization of text embeddings')
    plt.xlabel('t-SNE feature 1')
    plt.ylabel('t-SNE feature 2')
    plt.show()
    return labels
from sentence_transformers import SentenceTransformer
# TinyBERT: a small distilled BERT used via the 'auto' path ([CLS] pooling).
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModel.from_pretrained(model_name)
# Usage for SentenceTransformer
model_st = SentenceTransformer('all-MiniLM-L6-v2')
# Copy so downstream embedding/cluster columns never mutate ds.full_data.
embedding_df = ds.full_data.copy()
# NOTE(review): this run was interrupted by hand (see the KeyboardInterrupt
# traceback below) — the TinyBERT embeddings were never fully computed.
embeddings_tinybert = get_embeddings(embedding_df["text"], model, tokenizer, batch_size=256, use_gpu=True, model_type='auto')
Generating embeddings: 3%|▎ | 6/176 [00:27<13:13, 4.67s/it]
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) Cell In[67], line 3 1 embedding_df = ds.full_data.copy() ----> 3 embeddings_tinybert = get_embeddings(embedding_df["text"], model, tokenizer, batch_size=256, use_gpu=True, model_type='auto') Cell In[62], line 54, in get_embeddings(texts, model, tokenizer, batch_size, max_length, use_gpu, model_type) 52 outputs = model(**batch) 53 batch_embeddings = outputs.last_hidden_state[:, 0, :] ---> 54 embeddings.append(batch_embeddings.cpu()) 56 return torch.cat(embeddings).numpy() 58 elif model_type == 'sentence_transformer': KeyboardInterrupt:
embeddings_st = get_embeddings(embedding_df["text"], model_st, batch_size=256, use_gpu=True, model_type='sentence_transformer')
Generating embeddings: 100%|██████████| 176/176 [04:42<00:00, 1.61s/it]
import gc
gc.collect()
torch.cuda.empty_cache()
embeddings_st
array([[-0.04292417, 0.04017529, 0.00822352, ..., 0.00830901,
0.04013212, -0.05353491],
[-0.0300238 , -0.05997578, 0.09334268, ..., -0.08667795,
-0.02973704, 0.06296595],
[ 0.0077344 , 0.01727735, 0.01518998, ..., -0.06386147,
-0.015207 , -0.02204838],
...,
[-0.00774679, -0.00621239, 0.02480308, ..., -0.05097079,
0.00514519, 0.00038478],
[ 0.06024081, 0.06292548, 0.00465201, ..., -0.00634774,
0.04170959, 0.02873757],
[ 0.00446899, -0.08325394, -0.00282552, ..., -0.16707481,
0.09308673, -0.04893658]], dtype=float32)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score
def cluster_and_visualize(embeddings, min_cluster_size=500, min_samples=5):
    """Cluster embeddings with HDBSCAN and visualize them in 2-D t-SNE space.

    Parameters
    ----------
    embeddings : array-like of shape (n_samples, n_features)
    min_cluster_size, min_samples : HDBSCAN density parameters.

    Returns
    -------
    ndarray of per-sample cluster labels (-1 marks noise points).
    """
    embeddings = np.asarray(embeddings)

    # HDBSCAN Clustering
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
    cluster_labels = clusterer.fit_predict(embeddings)

    # HDBSCAN labels noise as -1. Treating noise as an extra pseudo-cluster
    # distorts the silhouette score, and silhouette_score raises ValueError
    # outright when fewer than two labels remain — score only the clustered
    # points and guard the degenerate case.
    clustered = cluster_labels != -1
    if np.unique(cluster_labels[clustered]).size >= 2:
        silhouette_avg = silhouette_score(embeddings[clustered],
                                          cluster_labels[clustered])
    else:
        silhouette_avg = float('nan')
        print("Fewer than two clusters found; silhouette score is undefined.")
    print(f"Silhouette Score: {silhouette_avg}")

    # t-SNE for dimensionality reduction (visualization only).
    tsne = TSNE(n_components=2, random_state=42)
    embeddings_2d = tsne.fit_transform(embeddings)

    # Plotting
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=cluster_labels, cmap='viridis')
    plt.colorbar(scatter)
    plt.title(f'HDBSCAN Clustering of Text Embeddings (Silhouette Score: {silhouette_avg:.2f})')
    plt.xlabel('t-SNE feature 0')
    plt.ylabel('t-SNE feature 1')
    plt.show()
    return cluster_labels
cluster_labels = cluster_and_visualize(embeddings_st)
Pipeline options:
1. Use spacy to summarize?
1.5 Temporal Analysis¶
importlib.reload(da_utils)
da_utils.sample_plot_by_time(ds.full_data)
<Figure size 1900x1100 with 0 Axes>
We can see that the dataset is not at all balanced time-wise, which is highly problematic because a model can exploit period-specific artifacts (topics, entities, and styles tied to one time window) rather than learning genuine signals of fakeness, and a random train/test split will leak those temporal patterns.
| Unnamed: 0 | title | text | subject | date | fake | extracted_features | anonymized_text | num_entities | num_person_entities | ... | gpe_Pyongyang | gpe_Canada | gpe_Australia | gpe_Michigan | gpe_Pakistan | gpe_Puerto Rico | gpe_Iowa | gpe_Lebanon | gpe_Bangladesh | gpe_Arizona | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22216 | Ben Stein Calls Out 9th Circuit Court: Committ... | 21st Century Wire says Ben Stein, reputable pr... | US_News | February 13, 2017 | 1 | {'entities': [('{RANDOM_ENTITY_PERSON_0}', 'PE... | 21st Century Wire says {RANDOM_ENTITY_PERSON_0... | 13 | 6 | ... | False | False | False | False | False | False | False | False | False | False |
| 1 | 27917 | Trump drops Steve Bannon from National Securit... | WASHINGTON (Reuters) - U.S. President Donald T... | politicsNews | April 5, 2017 | 0 | {'entities': [('{RANDOM_ENTITY_GPE_0}', 'GPE')... | {RANDOM_ENTITY_GPE_0} ({RANDOM_ENTITY_ORG_1}) ... | 114 | 49 | ... | False | False | False | False | False | False | False | False | False | False |
| 2 | 25007 | Puerto Rico expects U.S. to lift Jones Act shi... | (Reuters) - Puerto Rico Governor Ricardo Rosse... | politicsNews | September 27, 2017 | 0 | {'entities': [('{RANDOM_ENTITY_ORG_0}', 'ORG')... | ({RANDOM_ENTITY_ORG_0}) - {RANDOM_ENTITY_GPE_1... | 26 | 5 | ... | False | False | False | False | False | True | False | False | False | False |
| 3 | 1377 | OOPS: Trump Just Accidentally Confirmed He Le... | On Monday, Donald Trump once again embarrassed... | News | May 22, 2017 | 1 | {'entities': [('{RANDOM_ENTITY_DATE_0}', 'DATE... | On {RANDOM_ENTITY_DATE_0}, {RANDOM_ENTITY_PERS... | 24 | 8 | ... | False | False | False | False | False | False | False | False | False | False |
| 4 | 32476 | Donald Trump heads for Scotland to reopen a go... | GLASGOW, Scotland (Reuters) - Most U.S. presid... | politicsNews | June 24, 2016 | 0 | {'entities': [('{RANDOM_ENTITY_GPE_0}', 'GPE')... | GLASGOW, {RANDOM_ENTITY_GPE_0} ({RANDOM_ENTITY... | 75 | 10 | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44893 | 11284 | UNREAL! CBS’S TED KOPPEL Tells Sean Hannity He... | politics | Mar 27, 2017 | 1 | {'entities': [], 'entity_counts': {}, 'person_... | 0 | 0 | ... | False | False | False | False | False | False | False | False | False | False | ||
| 44894 | 44732 | PM May seeks to ease Japan's Brexit fears duri... | LONDON/TOKYO (Reuters) - British Prime Ministe... | worldnews | August 29, 2017 | 0 | {'entities': [('{RANDOM_ENTITY_GPE_0}', 'GPE')... | {RANDOM_ENTITY_GPE_0}/TOKYO ({RANDOM_ENTITY_OR... | 121 | 17 | ... | False | False | False | False | False | False | False | False | False | False |
| 44895 | 38158 | Merkel: Difficult German coalition talks can r... | BERLIN (Reuters) - Chancellor Angela Merkel sa... | worldnews | November 16, 2017 | 0 | {'entities': [('{RANDOM_ENTITY_GPE_0}', 'GPE')... | {RANDOM_ENTITY_GPE_0} ({RANDOM_ENTITY_ORG_1}) ... | 6 | 1 | ... | False | False | False | False | False | False | False | False | False | False |
| 44896 | 860 | Trump Stole An Idea From North Korean Propaga... | Jesus f*cking Christ our President* is a moron... | News | July 14, 2017 | 1 | {'entities': [('{RANDOM_ENTITY_PERSON_0}', 'PE... | {RANDOM_ENTITY_PERSON_0} f*cking Christ our Pr... | 111 | 26 | ... | False | False | False | False | False | False | False | False | False | False |
| 44897 | 15795 | BREAKING: HILLARY CLINTON’S STATE DEPARTMENT G... | IF SHE S NOT TOAST NOW THEN WE RE IN BIGGER TR... | politics | Apr 23, 2015 | 1 | {'entities': [('{RANDOM_ENTITY_PERSON_0}', 'PE... | IF SHE S NOT TOAST NOW THEN WE RE IN BIGGER TR... | 36 | 10 | ... | False | False | False | False | False | False | False | False | False | False |
44898 rows × 200 columns
3.0 Heuristics Based Model¶
Our goal is not to maximize performance but rather to find issues in the dataset:
- Is the model overfitting on some entities? e.g. Reuters might almost never report "fake" news, but we can't guarantee that won't change in the future. Also, a model that would rely on specific keywords would be trivial to game.
3.1 Feature Selection¶
We will start by dropping all entity features whose absolute normalized importance exceeds 0.25; this includes the following features:
# Drop entity features whose fake/true skew exceeds the threshold, to keep
# the model from keying on individual entities.
# NOTE(review): `d_sorted` is built in an earlier cell (not shown here) —
# presumably one row per feature column with a signed, normalized
# fake-vs-true proportion; confirm against that cell.
THRESHOLD = 0.25
dropped_cols = d_sorted[d_sorted['prop_fake_normalized'].abs() > THRESHOLD]
print(f"Dropping total columns: {len(dropped_cols)}")
# Column names to drop from the feature matrix below.
drop_cols = dropped_cols["col"]
# Display the dropped features, most frequent first.
dropped_cols.sort_values(by='total', ascending=False)
Dropping total columns: 90
| col | total | fake | true | prop_fake | total_normalized | prop_fake_normalized | importance_score | |
|---|---|---|---|---|---|---|---|---|
| 80 | org_Reuters | 21531 | 302 | 21229 | 0.014026 | 0.479554 | -0.508959 | 0.494256 |
| 162 | gpe_Washington | 10768 | 1973 | 8795 | 0.183228 | 0.239833 | -0.339757 | 0.289795 |
| 164 | gpe_America | 6650 | 5505 | 1145 | 0.827820 | 0.148114 | 0.304834 | 0.226474 |
| 3 | person_Hillary | 2943 | 2764 | 179 | 0.939178 | 0.065549 | 0.416192 | 0.240870 |
| 165 | gpe_China | 2611 | 492 | 2119 | 0.188434 | 0.058154 | -0.334552 | 0.196353 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 154 | org_Confederate | 249 | 201 | 48 | 0.807229 | 0.005546 | 0.284243 | 0.144895 |
| 158 | org_IRS | 234 | 195 | 39 | 0.833333 | 0.005212 | 0.310348 | 0.157780 |
| 61 | person_Reilly | 217 | 206 | 11 | 0.949309 | 0.004833 | 0.426323 | 0.215578 |
| 50 | person_Mugabe | 192 | 4 | 188 | 0.020833 | 0.004276 | -0.502152 | 0.253214 |
| 140 | org_SPD | 179 | 6 | 173 | 0.033520 | 0.003987 | -0.489466 | 0.246726 |
90 rows × 8 columns
from sklearn.model_selection import train_test_split
# Keep only numeric/boolean engineered features; raw text columns are not
# used by the heuristic model.
numeric_and_binary = ds.full_data.select_dtypes(include=[np.number, 'bool'])
X = numeric_and_binary.drop('fake', axis=1) # All columns except 'fake'
y = numeric_and_binary['fake'] # Target column
# Split the data
# TODO(review): consider stratify=y, and given the temporal imbalance shown
# in section 1.5, a time-based split would be a stronger leakage check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Second split with the skewed entity columns removed; same random_state so
# both splits contain the same rows and metrics are directly comparable.
X_2 = X.drop(columns=drop_cols)
X_train_no_r, X_test_no_r, y_train_no_r, y_test_no_r = train_test_split(X_2, y, test_size=0.2, random_state=42)
print(len(X.columns))
print(len(X_train_no_r.columns))
282 192
importlib.reload(ensemble_wrapper)
# Tune LightGBM on the full feature set. n_trials=3 is only a smoke-level
# search — increase for a real study.
tuning_results, tuning_params = ensemble_wrapper.tune_lgbm_model(
X_train=X_train,
y_train=y_train,
X_val=X_test,
y_val=y_test,
n_trials=3, verbose=2)
# Same search on the reduced set (entity-skew columns dropped), so the two
# runs can be compared like-for-like.
tuning_results_no_r, tuning_params_no_r = ensemble_wrapper.tune_lgbm_model(
X_train=X_train_no_r,
y_train=y_train_no_r,
X_val=X_test_no_r,
y_val=y_test_no_r,
n_trials=3, verbose=2)
tuning_results
| number | value | datetime_start | datetime_complete | duration | params_bagging_fraction | params_bagging_freq | params_feature_fraction | params_lambda_l1 | params_lambda_l2 | params_learning_rate | params_min_child_samples | params_num_leaves | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 0.999845 | 2024-08-13 18:51:21.834861 | 2024-08-13 18:51:25.769572 | 0 days 00:00:03.934711 | 0.972919 | 6 | 0.118526 | 4.329370e-07 | 4.473429e-07 | 0.133112 | 25 | 159 | COMPLETE |
| 2 | 2 | 0.999713 | 2024-08-13 18:51:25.772550 | 2024-08-13 18:51:29.750344 | 0 days 00:00:03.977794 | 0.362106 | 5 | 0.488751 | 4.258943e-06 | 1.982698e-05 | 0.037521 | 18 | 88 | COMPLETE |
| 0 | 0 | 0.999662 | 2024-08-13 18:51:19.022850 | 2024-08-13 18:51:21.831381 | 0 days 00:00:02.808531 | 0.638793 | 2 | 0.758795 | 3.332365e-08 | 6.245760e-01 | 0.711448 | 19 | 105 | COMPLETE |
# Hyper-parameters hard-coded from the best trial of the study above so the
# cell is reproducible without re-running the (non-deterministic) search.
# NOTE(review): the 'value' entry is the study's objective (validation AUC),
# not a LightGBM parameter — confirm train_lgbm_model strips or ignores it
# before passing this dict to LightGBM.
best_params = {'num_leaves': 105,
'learning_rate': 0.711447600934342,
'feature_fraction': 0.7587945476302645,
'bagging_fraction': 0.6387926357773329,
'bagging_freq': 2,
'min_child_samples': 19,
'lambda_l1': 3.3323645788192616e-08,
'lambda_l2': 0.6245760287469893,
'value': 0.9996621866873512}
# final_model, _eval_results = ensemble_wrapper.train_lgbm_model(X_train, y_train, X_test, y_test, best_params)
# Train the final model on the reduced feature set only.
final_model_no_r, _eval_results_no_r = ensemble_wrapper.train_lgbm_model(X_train_no_r, y_train_no_r, X_test_no_r, y_test_no_r, best_params)
Training until validation scores don't improve for 50 rounds [100] valid_0's auc: 0.9976 [200] valid_0's auc: 0.997842 [300] valid_0's auc: 0.997924 [400] valid_0's auc: 0.99803 Early stopping, best iteration is: [402] valid_0's auc: 0.998034 Training LightGBM model took 7.13 seconds
# Evaluate the reduced-feature model on its held-out split.
test_metrics = ensemble_wrapper.evaluate_model(final_model_no_r, X_test_no_r, y_test_no_r)
print("Test Metrics:")
test_metrics
Test Metrics:
{'accuracy': 0.9816258351893096,
'precision': 0.9840255591054313,
'recall': 0.9808917197452229,
'f1': 0.9824561403508771,
'auc': 0.998033632164362,
'log_loss': 0.08163107432287664}
importlib.reload(shap)
explainer, shap_values = shap.plot_shap_summary(final_model_no_r, X_test_no_r, plot_type="beeswarm")
/usr/local/lib/python3.10/dist-packages/shap/explainers/_tree.py:448: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn('LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray')
if VERBOSE:
importlib.reload(ensemble_wrapper)
combined_imp = ensemble_wrapper.get_combined_feature_importance(final_model_no_r, X_train_no_r)
combined_imp_test = ensemble_wrapper.get_combined_feature_importance(final_model_no_r, X_test_no_r)
# feature_imp_df = ensemble_wrapper.get_feature_importances(final_model_no_r, X_test_no_r.columns)
# display(feature_imp_df.sort_values(by=["importance_percentage"], ascending=False).head(25))
/usr/local/lib/python3.10/dist-packages/shap/explainers/_tree.py:448: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn('LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray')
/usr/local/lib/python3.10/dist-packages/shap/explainers/_tree.py:448: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn('LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray')
if VERBOSE:
display(combined_imp.sort_values(by=["mean_abs_shap"], ascending=False).head(25))
| feature | importance | importance_percentage | mean_abs_shap | shap_importance_percentage | |
|---|---|---|---|---|---|
| 5 | pos_ratio_PUNCT | 1259 | 3.035344 | 4.402535 | 10.296663 |
| 9 | ner_density | 1204 | 2.902744 | 2.211655 | 5.172626 |
| 2 | verb_tense_ratio_past | 1383 | 3.334298 | 2.019577 | 4.723392 |
| 3 | avg_token_vector_norm | 1364 | 3.288490 | 1.865964 | 4.364123 |
| 16 | punctuation_density | 1123 | 2.707459 | 1.830244 | 4.280579 |
| 1 | std_sent_length | 1397 | 3.368051 | 1.689886 | 3.952311 |
| 8 | pos_ratio_ADV | 1246 | 3.004002 | 1.572365 | 3.677453 |
| 30 | num_date_entities | 746 | 1.798544 | 1.551266 | 3.628105 |
| 25 | num_gpe_entities | 884 | 2.131250 | 1.450836 | 3.393219 |
| 13 | avg_word_length | 1150 | 2.772554 | 1.380218 | 3.228058 |
| 23 | avg_sent_length | 983 | 2.369931 | 1.142517 | 2.672123 |
| 15 | pos_ratio_PRON | 1130 | 2.724336 | 1.127723 | 2.637522 |
| 4 | pos_ratio_SPACE | 1266 | 3.052220 | 1.093518 | 2.557523 |
| 26 | median_sent_length | 881 | 2.124018 | 1.081556 | 2.529546 |
| 22 | pos_ratio_PROPN | 1008 | 2.430204 | 1.071308 | 2.505578 |
| 41 | gpe_U.S. | 73 | 0.175997 | 1.050933 | 2.457926 |
| 28 | num_person_entities | 835 | 2.013115 | 0.928912 | 2.172543 |
| 6 | pos_ratio_PART | 1252 | 3.018468 | 0.849034 | 1.985723 |
| 40 | person_Obama | 86 | 0.207339 | 0.798815 | 1.868271 |
| 43 | person_Trump | 54 | 0.130189 | 0.781697 | 1.828234 |
| 29 | num_org_entities | 802 | 1.933555 | 0.725261 | 1.696243 |
| 32 | num_words | 585 | 1.410386 | 0.652742 | 1.526636 |
| 24 | pos_ratio_NUM | 896 | 2.160181 | 0.648339 | 1.516337 |
| 56 | person_@realDonaldTrump | 41 | 0.098848 | 0.598985 | 1.400909 |
| 12 | pos_ratio_SCONJ | 1173 | 2.828005 | 0.575131 | 1.345118 |
# Plot SHAP dependence for a specific feature
# NOTE(review): this call raised IndexError (see the traceback below) —
# plot_shap_dependence in src/models/shared/shap_analysis.py indexes
# shap_values[1], but with this shap/LightGBM combination shap_values is a
# single ndarray rather than a per-class list; the helper needs a guard on
# the shap_values shape/type.
shap.plot_shap_dependence(final_model_no_r, X_test_no_r, "pos_ratio_PUNCT")
/usr/local/lib/python3.10/dist-packages/shap/explainers/_tree.py:448: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn('LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray')
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In[34], line 2 1 # Plot SHAP dependence for a specific feature ----> 2 shap.plot_shap_dependence(final_model_no_r, X_test_no_r, "pos_ratio_PUNCT") File /mnt/v/projects/DL_s4/src/models/shared/shap_analysis.py:67, in plot_shap_dependence(model, X, feature) 64 shap_values = explainer.shap_values(X) 66 plt.figure(figsize=(10, 6)) ---> 67 shap.dependence_plot(feature, shap_values[1], X, show=False) 68 plt.title(f"SHAP Dependence Plot for {feature}") 69 plt.tight_layout() File /usr/local/lib/python3.10/dist-packages/shap/plots/_scatter.py:572, in dependence_legacy(ind, shap_values, features, feature_names, display_features, interaction_index, color, axis_color, cmap, dot_size, x_jitter, alpha, title, xmin, xmax, ax, show, ymin, ymax) 570 if not hasattr(ind, "__len__"): 571 if interaction_index == "auto": --> 572 interaction_index = approximate_interactions(ind, shap_values, features)[0] 573 interaction_index = convert_name(interaction_index, shap_values, feature_names) 574 categorical_interaction = False File /usr/local/lib/python3.10/dist-packages/shap/utils/_general.py:123, in approximate_interactions(index, shap_values, X, feature_names) 121 x = X[inds, index] 122 srt = np.argsort(x) --> 123 shap_ref = shap_values[inds, index] 124 shap_ref = shap_ref[srt] 125 inc = max(min(int(len(x) / 10.0), 50), 1) IndexError: index 37 is out of bounds for axis 1 with size 1
<Figure size 1000x600 with 0 Axes>
3.3 Feature Analysis¶
3.3.1 pos_ratio_PUNCT¶
'pos_ratio_PUNCT' is the ratio of punctuation tokens to the total number of tokens (based on Spacy 'PUNCT' tag). This feature indicates the relative frequency of punctuation in the text, which can be informative about writing style, formality, or the type of content.
da_utils.render_rel_kde_plot(ds.full_data, 'pos_ratio_PUNCT')
importlib.reload(da_utils)
da_utils.corr_strip(ds.full_data, 'pos_ratio_PUNCT')
3.3.2 ner_DENSITY¶
'ner_density' represents the ratio of named entities to the total number of tokens in a text. It's calculated by dividing the count of named entities (like persons, organizations, locations) by the total number of tokens. So basically a higher ner_density indicates that a larger proportion of the text consists of named entities, interestingly "fake" news articles have a generally higher proportion of named entities to other text.
da_utils.render_rel_kde_plot(ds.full_data, 'ner_density')
importlib.reload(da_utils)
da_utils.corr_strip(ds.full_data, 'ner_density')
3.3.3 verb_tense_ratio_past¶
The proportion of verbs in the text that are in the past tense. A higher value indicates more historical or narrative content.
da_utils.render_rel_kde_plot(ds.full_data, 'verb_tense_ratio_past', x_lim=(-0.1,1.1))
da_utils.corr_strip(ds.full_data, 'verb_tense_ratio_past')
3.3.4 avg_token_vector_norm¶
The average magnitude of word vectors in the text. Higher values suggest more semantically rich or specific vocabulary.
da_utils.render_rel_kde_plot(ds.full_data, 'avg_token_vector_norm', x_lim=(40, 80))
da_utils.corr_strip(ds.full_data, 'avg_token_vector_norm')
3.3.5 punctuation_density¶
The ratio of punctuation marks to total characters. Higher values may indicate more complex sentence structures or dialogue-heavy text
da_utils.render_rel_kde_plot(ds.full_data, 'punctuation_density', x_lim=(-0.01, 0.1))
da_utils.corr_strip(ds.full_data, 'punctuation_density')
3.3.6 std_sent_length¶
Standard deviation of sentence lengths. Higher values suggest more variability in sentence structure, potentially indicating more complex writing.
da_utils.render_rel_kde_plot(ds.full_data, 'std_sent_length', x_lim=(-1, 45))
da_utils.corr_strip(ds.full_data, 'std_sent_length')
3.3.7 pos_ratio_ADV¶
The proportion of adverbs in the text. A higher ratio might indicate more descriptive or nuanced language, often used to modify verbs, adjectives, or other adverbs
da_utils.render_rel_kde_plot(ds.full_data, 'pos_ratio_ADV', x_lim=(-0.01, 0.1))
da_utils.corr_strip(ds.full_data, 'pos_ratio_ADV')